In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
In [2]:
# Convert the BIOM table to a tab-delimited file in bash, keeping the 'taxonomy' column:
# biom convert -i reference-hit.tax.biom -o table.from_biom.txt --to-tsv --header-key 'taxonomy'
In [3]:
# Load the tab-delimited feature table, indexed by the '#OTU ID' column.
# skiprows=1 drops the first line of the file so that the second line is used as the header
# (presumably the banner line written by `biom convert` -- confirm against the file).
# NOTE(review): this path climbs two directories ('../../data') while every other path in this
# notebook uses '../data' -- confirm which location is intended.
bt = pd.read_csv(
    '../../data/table.from_biom.txt',
    sep='\t',
    index_col='#OTU ID',
    skiprows=1,
)
In [4]:
# Sanity check on the loaded feature table: print (rows, columns), then preview the first five rows.
print(bt.shape)
bt.head(n=5)
Out[4]:
In [5]:
# Character length of every taxonomy string; print the shortest and the longest.
taxonomy_lengths = bt['taxonomy'].str.len()
print(taxonomy_lengths.min())
print(taxonomy_lengths.max())
In [6]:
# Write the feature table back out as a tab-delimited file, keeping the '#OTU ID' index column.
bt.to_csv(
    '../data/biomtable.txt',
    sep='\t',
    index=True,
)
In [7]:
# Load the sample mapping file, indexed by '#SampleID'.
# dtype=str keeps every column as text here; numeric and categorical dtypes are assigned
# explicitly in a later cell.
mf = pd.read_csv(
    '../data/mapping_MrOS.txt',
    sep='\t',
    dtype=str,
    index_col='#SampleID',
)
In [8]:
# Sanity check on the loaded mapping file: print (rows, columns), then preview the first five rows.
print(mf.shape)
mf.head(n=5)
Out[8]:
In [9]:
# Mapping-file columns that are handled as categorical variables.
vars_cat = np.array([
    'BarcodeSequence',
    'LinkerPrimerSequence',
    'Experiment_Design_Description',
    'Library_Construction_Protocol',
    'Linker',
    'Platform',
    'Center_Name',
    'Center_Project',
    'Instrument_Model',
    'Title',
    'Anonymized_Name',
    'Scientific_Name',
    'Taxon_ID',
    'Sample_Type',
    'Geo_Loc_Name',
    'Elevation',
    'Env_Biome',
    'Env_Feature',
    'Env_Material',
    'Env_Package',
    'Collection_Timestamp',
    'DNA_Extracted',
    'Physical_Specimen_Location',
    'Physical_Specimen_Remaining',
    'Age_Units',
    'Host_Subject_ID',
    'Host_Taxid',
    'Host_Scientific_Name',
    'Host_Common_Name',
    'Life_Stage',
    'Sex',
    'Height_Units',
    'Weight_Units',
    'Body_Habitat',
    'Body_Site',
    'Body_Product',
    'GIERACE',
    'SITE',
    'TUDRAMT',
    'TURSMOKE',
    'M1ADEPR',
    'M1VITMND',
    'M1ANTIB',
    'M1PROBI',
    'OHSEAS',
    'VDstatus',
    'Description',
    'OHV1D2CT',
    'OHVD2CT',
])

# Mapping-file columns that are handled as continuous (numeric) variables.
vars_cts = np.array([
    'Latitude',
    'Longitude',
    'Age',
    'Height',
    'Weight',
    'BMI',
    'PASCORE',
    'DTVITD',
    'OHV1D3',
    'OHV24D3',
    'OHVD3',
    'OHVD2',
    'OHV1D2',
    'OHVDTOT',
    'OHV1DTOT',
])
In [10]:
# Work on a copy so the raw, all-string mapping frame `mf` stays untouched.
df = mf.copy()

# Continuous columns -> numeric; errors='coerce' turns entries that cannot be parsed into NaN.
numeric_part = df[vars_cts].apply(pd.to_numeric, errors='coerce')
# Categorical columns -> pandas 'category' dtype (each column is converted on its own).
categorical_part = df[vars_cat].astype('category')

df[vars_cts] = numeric_part
df[vars_cat] = categorical_part
In [11]:
# Convert the columns recorded in pg/ml to ng/ml (1 ng/ml = 1000 pg/ml).
# The values are re-derived from the raw mapping frame `mf` instead of being divided in place,
# so re-running this cell cannot divide the same column by 1000 a second time.
PG_PER_NG = 1000
pg_ml_columns = ['OHV1D3', 'OHV1D2', 'OHV1DTOT']
df[pg_ml_columns] = mf[pg_ml_columns].apply(pd.to_numeric, errors='coerce') / PG_PER_NG
In [12]:
#df.M1ANTIB.value_counts()
In [13]:
# Derived ratios relative to OHVD3.
# OHV1D3 was rescaled from pg/ml to ng/ml in an earlier cell, so numerator and denominator of
# both ratios are in ng/ml and no extra unit factor is needed here.
# A zero or missing OHVD3 produces inf/NaN instead of raising.
ratio_columns = ['ratio_activation', 'ratio_catabolism']
df['ratio_activation'] = df.OHV1D3/df.OHVD3
df['ratio_catabolism'] = df.OHV24D3/df.OHVD3

# Register the ratios as continuous variables. Any copy already present is dropped first,
# so re-running this cell does not leave duplicate names in vars_cts.
vars_cts = np.append(vars_cts[~np.isin(vars_cts, ratio_columns)], ratio_columns)
In [14]:
# Summary statistics for the continuous variables.
df.loc[:, vars_cts].describe()
Out[14]:
In [15]:
# Summary (count / unique / top / freq) for the categorical variables.
df.loc[:, vars_cat].describe()
Out[15]:
In [16]:
# Number of missing values in each continuous variable.
df[vars_cts].isna().sum()
Out[16]:
In [17]:
# for i in range(len(vars_cat)):
# print(df[vars_cat[i]].value_counts())
In [18]:
# Compare dimensions of the raw frame (mf) and the cleaned frame (df):
# df is a copy of mf with the two ratio columns added.
for frame in (mf, df):
    print(frame.shape)
In [19]:
# Save the cleaned mapping table as a tab-delimited file, keeping the '#SampleID' index column.
df.to_csv(
    '../data/mapping_cleaned_MrOS.txt',
    sep='\t',
    index=True,
)
In [ ]: